# data cleaning
# QB stats: drop Position and standardize the passer-rating column name.
qb_stats <- qb_stats %>%
  select(-Position) %>%
  rename("Rating" = "Passer Rating")
# Biographical table: shorten verbose column names, then keep quarterbacks only.
basic_stats <- basic_stats %>%
  rename(
    "Team" = "Current Team",
    "hsloc" = "High School Location",
    "Weight" = "Weight (lbs)",
    "Height" = "Height (inches)"
  ) %>%
  filter(Position == "QB")
# Combine passing stats with biographical info by player name.
# NOTE(review): merge() on Name assumes names are unique in both tables —
# duplicate names would fan rows out; confirm against the data source.
qb_comb <- merge(qb_stats, basic_stats, by = "Name")
# '--' is the source's placeholder for "no attempts"; recode to 0 before the
# numeric coercion so as.numeric() does not introduce NAs.
# (fixed: use <- rather than = for assignment, per R style)
qb_comb$`Passes Attempted`[which(qb_comb$`Passes Attempted` == '--')] <- 0
qb_comb <- qb_comb %>%
  mutate(
    `Passes Attempted` = as.numeric(`Passes Attempted`),
    `Games Played` = as.numeric(`Games Played`)
  )
# Drop columns not used downstream.
qb_comb <- qb_comb %>% select(-c(Number, `Years Played`))
# Extract the leading integer from strings like "5 Seasons".
qb_comb <- qb_comb %>%
  mutate(Experience = as.numeric(stringr::str_extract(Experience, '[0-9]+')))
# The last two characters of the high-school location are the state code.
qb_comb <- qb_comb %>% mutate(hsState = stringr::str_sub(hsloc, -2, -1))
# Assign each player's college conference BY ROW POSITION in qb_comb.
# WARNING(review): this hard-coded vector must match both the exact row count
# and the exact row order of qb_comb produced by the merge above — any upstream
# change (new rows, different sort order) silently mislabels conferences.
# Prefer a College -> Conference lookup table joined by college name.
qb_comb <- qb_comb %>% mutate(Conference = c("PAC-12", "PAC-12", "PAC-12", "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"PAC-12", "PAC-12", "PAC-12", "PAC-12",
"AAC", "AAC", "AAC",
"Big 12",
"Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
"Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10",
"Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10",
"ACC", "ACC",
"ACC",
"Mountain West", "Mountain West", "Mountain West",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"Big 10",
"Big 10","Big 10","Big 10","Big 10","Big 10",
"SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC",
"Big 12", "Big 12", "Big 12", "Big 12", "Big 12", "Big 12",
"Big 12","Big 12","Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
"CUSA", "CUSA", "CUSA", "CUSA", "CUSA",
"Mountain West", "Mountain West", "Mountain West",
"Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy",
"Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
"Ohio Valley Conference", "Ohio Valley Conference", "Ohio Valley Conference",
"ACC", "ACC", "ACC", "ACC",
"PAC-12",
"Big 12","Big 12","Big 12","Big 12","Big 12",
"Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10",
"ACC", "ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC",
"PAC-12",
"Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10",
"PAC-12","PAC-12",
"Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League",
"Big 10",
"Big 12","Big 12","Big 12","Big 12",
"Mountain West","Mountain West","Mountain West","Mountain West","Mountain West","Mountain West",
"AAC","AAC","AAC","AAC","AAC",
"PAC-12",
"ACC","ACC","ACC","ACC","ACC","ACC",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"AAC",
"SEC","SEC","SEC","SEC","SEC","SEC","SEC",
"SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC",
"PAC-12","PAC-12",
"ACC", "ACC", "ACC", "ACC",
"PAC-12","PAC-12",
"SEC", "SEC", "SEC",
"Southland Conference", "Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference",
"CUSA", "CUSA", "CUSA", "CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA", "Big 12",
"Big 12","Big 12","Big 12","Big 12","Big 12","Big 12", "Big 10",
"Big 10","Big 10","Big 10","Mountain West",
"Mountain West", "Mountain West", "Mountain West", "PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","ACC",
"ACC", "ACC", "ACC", "SEC",
"SEC","SEC","SEC","SEC","SEC","AAC",
"AAC", "AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","Big 12",
"Big 12", "ACC",
"ACC","ACC","ACC","ACC","ACC","SEC",
"ACC",
"ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","MAC",
"MAC", "MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","ACC",
"ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","ACC",
"ACC","SEC",
"Big 10",
"Big 10","SEC",
"SEC", "PAC-12",
"PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","Big 12","Big 12","Big 12","Big 12",
"SEC", "SEC", "SEC","SEC",
"SEC","SEC","SEC","SEC","Big 10","Big 10","Big 10","Big 10",
"Big 10","Big 10","Big 10","Big 10","Big 10","SEC","SEC","SEC","SEC",
"SEC", "Midwest Conference","Midwest Conference","Midwest Conference","ACC",
"ACC", "ACC","ACC",
"ACC","ACC","ACC","ACC","ACC","Big 10","Big 10","Big 10","Big 10",
"Big 10","Big 10","CUSA","CUSA","CUSA","CUSA",
"CUSA","CUSA","CUSA","Big 12","Big 12","Big 12","Big 12",
"Big 12","Big 12","Missouri Valley Conference","Big 10","Big 10","Big 10",
"Big 10",
"Big 10","ACC", "ACC", "ACC", "ACC", "ACC", "ACC", "ACC", "ACC"
))
#tester <- qb_comb %>% select(Name, College, Conference)
# Keep only regular/post-season starts with a decisive result, then drop
# identifier and schedule columns so only per-game performance stats remain.
qb_logs <- qb_logs %>%
  filter(Season != "Preseason", Outcome != "T", `Games Started` == 1) %>%
  select(-c(Position, Week, `Game Date`, Score, `Games Played`, `Games Started`, Name, `Player Id`, Year, Season, Opponent, `Home or Away`))
# Encode the outcome as a factor with 'L' as the reference level, so later
# logistic models estimate the odds of winning relative to losing.
qb_logs$Outcome <- as.factor(qb_logs$Outcome)
qb_logs <- qb_logs %>%
  mutate(Outcome = relevel(Outcome, ref = 'L')) #set reference level to be L
# Replace NA stats with 0 (e.g. no rushes -> 0 rushing yards).
# (fixed: use <- rather than = for assignment, per R style)
qb_logs[is.na(qb_logs)] <- 0
# creation of cv folds
# Fixed: seed the RNG so the 10-fold split is reproducible — the script already
# seeds the qb_logs folds (set.seed(1512) further down); do the same here.
set.seed(1512)
qbcomb_cv <- vfold_cv(qb_comb, v = 10)
# model spec
# Ordinary least squares regression via the base 'lm' engine.
lm_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')
# LASSO regression: mixture = 1 selects the pure-lasso penalty; the penalty
# strength itself is left as a tuning parameter to be chosen later.
lm_lasso_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_args(mixture = 1, penalty = tune()) %>%
  set_engine(engine = 'glmnet')
# recipes & workflows
# OLS recipe: Rating modeled on physical/experience/background predictors.
# `Passes Attempted` and `Games Played` are demoted to role 'Info' so they are
# available for the filter step but are not used as predictors.
# NOTE(review): this lm_rec/lm_wf pair is redefined later in the file before it
# is ever fit, so this first version is effectively dead code.
lm_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
  update_role(`Passes Attempted`, new_role = 'Info') %>%
  update_role(`Games Played`, new_role = 'Info') %>%
  step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
  step_lincomb(all_numeric_predictors()) %>% # removes predictors that are linear combos of others
  # fixed: step_corr() requires numeric columns; all_predictors() would also
  # select the factors Conference/hsState (dummied only in the NEXT step) and
  # error at prep time.
  step_corr(all_numeric_predictors()) %>% #removes highly correlated variables
  step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
#OLS workflow
lm_wf <- workflow() %>%
  add_recipe(lm_rec) %>%
  add_model(lm_spec)
#LASSO recipe
# Same predictor set as the OLS recipe; nzv/novel/normalize prepare the data
# for glmnet (standardization is required for a fair LASSO penalty).
lasso_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
update_role(`Passes Attempted` , new_role = 'Info') %>%
update_role(`Games Played` , new_role = 'Info') %>%
step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
step_nzv(all_predictors()) %>% # removes variables with the same value
step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
# Add natural-cubic-spline expansions of the numeric predictors to capture the
# non-linear residual patterns seen in the diagnostic plots below.
# NOTE(review): `Games Played` has role 'Info' yet gets a spline step — confirm
# that is intended.
ns3_lasso <- lasso_rec %>%
step_ns(Weight, deg_free = 3) %>% # natural cubic splines (higher deg_free means more knots)
step_ns(Height, deg_free = 3) %>%
step_ns(Age, deg_free = 3) %>%
step_ns(Experience, deg_free = 3) %>%
step_ns(`Games Played`, deg_free = 2)
#lasso_rec %>% prep(qb_comb) %>% juice()
#LASSO workflow
lasso_wf <- workflow() %>%
add_recipe(ns3_lasso) %>%
add_model(lm_lasso_spec)
# fit & tune LASSO model
# lm_lasso_spec (defined above) already has penalty = tune(); alias it instead
# of duplicating the whole spec definition, keeping the lm_lasso_spec_tune name
# for any downstream references.
lm_lasso_spec_tune <- lm_lasso_spec
lasso_wf <- workflow() %>%
  add_recipe(ns3_lasso) %>%
  add_model(lm_lasso_spec_tune)
# Candidate penalties: 50 values evenly spaced on the log10 scale.
penalty_grid <- grid_regular(
  penalty(range = c(-5, 3)), #log10 transformed 10^-5 to 10^3
  levels = 50)
# Evaluate every candidate penalty on each CV fold, scored by RMSE.
tune_res <- tune_grid(
  lasso_wf,            # workflow
  resamples = qbcomb_cv, # folds
  metrics = metric_set(rmse),
  grid = penalty_grid  # penalty grid
)
# Penalty-vs-metric curve, then the CV RMSE for each candidate penalty.
autoplot(tune_res)
tune_res %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(penalty, rmse = mean)
## # A tibble: 50 × 2
## penalty rmse
## <dbl> <dbl>
## 1 0.00001 35.8
## 2 0.0000146 35.8
## 3 0.0000212 35.8
## 4 0.0000309 35.8
## 5 0.0000450 35.8
## 6 0.0000655 35.8
## 7 0.0000954 35.8
## 8 0.000139 35.8
## 9 0.000202 35.8
## 10 0.000295 35.8
## # … with 40 more rows
# Pick the penalty with the best CV RMSE and refit the finalized workflow on
# the full data set.
best_penalty <- select_best(tune_res, metric = 'rmse') # choose best penalty value
final_wf <- finalize_workflow(lasso_wf, best_penalty) # incorporates penalty value to workflow
final_fit <- fit(final_wf, data = qb_comb)
# Coefficient table of the final LASSO fit.
output <- tidy(final_fit)
output %>% arrange(desc(estimate)) #sort variables in the LASSO model by coefficient value (a way of measuring variable importance)
## # A tibble: 57 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 80.1 0.256
## 2 Conference_Ohio.Valley.Conference 29.4 0.256
## 3 Games Played_ns_1 16.6 0.256
## 4 Games Played_ns_2 9.75 0.256
## 5 Experience_ns_3 7.02 0.256
## 6 hsState_HI 5.47 0.256
## 7 Conference_MAC 4.12 0.256
## 8 hsState_MS 3.11 0.256
## 9 Conference_ACC 2.98 0.256
## 10 hsState_OH 2.79 0.256
## # … with 47 more rows
###LASSO w/ splines CV Metrics
# calculate/collect CV metrics
# Fixed: resample the finalized *unfitted* workflow (final_wf) rather than the
# trained object final_fit — fit_resamples() refits on each fold and expects a
# workflow/model spec, not an already-fitted model.
mod1_cv <- fit_resamples(final_wf,
  resamples = qbcomb_cv,
  metrics = metric_set(rmse, rsq, mae)
)
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 24.3 10 0.944 Preprocessor1_Model1
## 2 rmse standard 35.5 10 1.08 Preprocessor1_Model1
## 3 rsq standard 0.284 10 0.0341 Preprocessor1_Model1
# Penalty-vs-RMSE table again (repeated here for the report).
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(penalty, rmse = mean)
## # A tibble: 50 × 2
## penalty rmse
## <dbl> <dbl>
## 1 0.00001 35.8
## 2 0.0000146 35.8
## 3 0.0000212 35.8
## 4 0.0000309 35.8
## 5 0.0000450 35.8
## 6 0.0000655 35.8
## 7 0.0000954 35.8
## 8 0.000139 35.8
## 9 0.000202 35.8
## 10 0.000295 35.8
## # … with 40 more rows
# visual residuals
# Predictions on the training data plus residuals for diagnostic plots.
mod1_output <- qb_comb %>%
  bind_cols(predict(final_fit, new_data = qb_comb)) %>%
  mutate(resid = Rating - .pred)
# Helper: residual-vs-predictor scatter with a smoother and a zero line.
# Extracted because the original repeated the identical ggplot chain 8 times.
plot_resid <- function(data, var) {
  p <- ggplot(data, aes(x = .data[[var]], y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic()
  print(p) # explicit print so the plot renders from inside a function call
}
plot_resid(mod1_output, "Height")           #Height (non-linearity)
plot_resid(mod1_output, "Age")              #Age (some non-linearity)
plot_resid(mod1_output, "Weight")           #Weight (non-linearity)
plot_resid(mod1_output, "Experience")       #Experience (non-linearity)
plot_resid(mod1_output, "Conference")       #College Conference
plot_resid(mod1_output, "hsState")          #High School State
plot_resid(mod1_output, "Passes Attempted") #Passes Attempted (non-linearity)
plot_resid(mod1_output, "Games Played")     #Games Played (non-linearity)
#OLS Model w/o splines
# (fixed: removed a stray '<<<<<<< HEAD' merge-conflict marker that made the
# script unparseable)
lm_spec <-
  linear_reg() %>%
  set_engine(engine = 'lm') %>%
  set_mode('regression')
#OLS recipe: lump factor levels rarer than 5% into 'other', then dummy-encode.
lm_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
  update_role(`Passes Attempted`, new_role = 'Info') %>%
  update_role(`Games Played`, new_role = 'Info') %>%
  step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
  step_other(all_nominal_predictors(), threshold = .05) %>%
  step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
#OLS workflow
lm_wf <- workflow() %>%
  add_recipe(lm_rec) %>%
  add_model(lm_spec)
# Coefficients of the no-splines OLS fit on the full data.
fit(lm_wf, data = qb_comb) %>% tidy()
## # A tibble: 14 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 185. 45.2 4.10 0.0000534
## 2 Height -1.11 0.694 -1.60 0.111
## 3 Age -1.37 0.798 -1.72 0.0864
## 4 Weight 0.0960 0.108 0.893 0.372
## 5 Experience 1.62 0.786 2.07 0.0395
## 6 Conference_Big.10 -4.74 3.09 -1.54 0.126
## 7 Conference_Big.12 -8.97 3.37 -2.66 0.00824
## 8 Conference_PAC.12 -11.2 3.23 -3.48 0.000582
## 9 Conference_SEC -5.26 2.98 -1.77 0.0780
## 10 Conference_other -9.79 2.85 -3.43 0.000678
## 11 hsState_OH 2.70 4.11 0.657 0.511
## 12 hsState_PA -7.11 4.36 -1.63 0.104
## 13 hsState_TX -4.00 2.73 -1.46 0.144
## 14 hsState_other -6.94 2.65 -2.62 0.00912
#calculate/collect CV metrics
# 10-fold CV estimates of MAE/RMSE/R^2 for the no-splines OLS workflow.
lm_cv <- fit_resamples(lm_wf,
resamples = qbcomb_cv,
metrics = metric_set(rmse, rsq, mae))
lm_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 26.9 10 0.915 Preprocessor1_Model1
## 2 rmse standard 39.6 10 1.02 Preprocessor1_Model1
## 3 rsq standard 0.0507 10 0.0127 Preprocessor1_Model1
#OLS w/o splines is far worse than incorporating natural cubic splines into the model
#qb_comb %>% purrr::map(~sum(is.na(.)))
# NOTE(review): lm_spec and lm_rec below re-define identically-named objects
# from the previous section; consider distinct names to avoid confusion.
lm_spec <-
linear_reg() %>%
set_engine(engine = 'lm') %>%
set_mode('regression')
#OLS recipe
lm_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
update_role(`Passes Attempted`, new_role = 'Info') %>%
update_role(`Games Played`, new_role = 'Info') %>%
step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
#step_lincomb(all_numeric_predictors()) %>% # removes predictors that are linear combos of others
#step_corr(all_numeric_predictors()) %>% #removes highly correlated variables
step_other(all_nominal_predictors(),threshold = .05) %>%
step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
# Add natural cubic splines to the OLS recipe (same expansions as the LASSO).
ns3_OLS <- lm_rec %>%
step_ns(Weight, deg_free = 3) %>% # natural cubic splines (higher deg_free means more knots)
step_ns(Height, deg_free = 3) %>%
step_ns(Age, deg_free = 3) %>%
step_ns(Experience, deg_free = 3) %>%
step_ns(`Games Played`, deg_free = 2)
#OLS workflow
splines_wf <- workflow() %>%
add_recipe(ns3_OLS) %>%
add_model(lm_spec)
# Coefficients of the splined OLS model fit on the full data.
fit(splines_wf, data = qb_comb) %>% tidy()
## # A tibble: 24 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 83.7 10.1 8.25 4.87e-15
## 2 Conference_Big.10 -7.95 3.37 -2.36 1.89e- 2
## 3 Conference_Big.12 -5.88 3.25 -1.81 7.18e- 2
## 4 Conference_PAC.12 -3.40 3.77 -0.900 3.69e- 1
## 5 Conference_SEC -1.55 3.18 -0.487 6.26e- 1
## 6 Conference_other -5.37 2.91 -1.85 6.60e- 2
## 7 hsState_OH 9.04 4.44 2.04 4.23e- 2
## 8 hsState_PA 1.69 4.81 0.352 7.25e- 1
## 9 hsState_TX -2.51 2.98 -0.843 4.00e- 1
## 10 hsState_other -2.96 3.15 -0.940 3.48e- 1
## # … with 14 more rows
#calculate/collect CV metrics
# 10-fold CV estimates of MAE/RMSE/R^2 for the splined OLS workflow.
splines_cv <- fit_resamples(splines_wf,
resamples = qbcomb_cv,
metrics = metric_set(rmse, rsq, mae))
splines_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 24.4 10 0.901 Preprocessor1_Model1
## 2 rmse standard 35.7 10 1.06 Preprocessor1_Model1
## 3 rsq standard 0.271 10 0.0319 Preprocessor1_Model1
# Binary logistic regression (glm engine) for the game outcome.
logistic_mod <- logistic_reg() %>%
  set_mode('classification') %>%
  set_engine("glm")
# Outcome ~ . uses every remaining game-log column; standardize the numeric
# predictors so coefficient magnitudes are comparable.
outcome_rec <- recipe(Outcome ~ ., data = qb_logs) %>%
  step_normalize(all_numeric_predictors())
# Train the workflow on the full game-log data.
logistic_mod_fit <- workflow() %>%
  add_recipe(outcome_rec) %>%
  add_model(logistic_mod) %>%
  fit(data = qb_logs)
logistic_mod_fit %>%
  tidy() # Model Coefficients from Trained Model
## # A tibble: 17 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0869 0.0267 3.25 1.15e- 3
## 2 `Passes Completed` 0.418 0.152 2.76 5.81e- 3
## 3 `Passes Attempted` -1.02 0.116 -8.82 1.18e- 18
## 4 `Completion Percentage` -0.136 0.0861 -1.58 1.15e- 1
## 5 `Passing Yards` 0.289 0.0999 2.89 3.81e- 3
## 6 `Passing Yards Per Attempt` -0.0444 0.0672 -0.661 5.08e- 1
## 7 `TD Passes` 0.303 0.0639 4.75 2.06e- 6
## 8 Ints -0.301 0.0660 -4.57 4.98e- 6
## 9 Sacks -0.154 0.0607 -2.53 1.14e- 2
## 10 `Sacked Yards Lost` -0.272 0.0606 -4.48 7.35e- 6
## 11 `Passer Rating` 0.639 0.135 4.75 2.03e- 6
## 12 `Rushing Attempts` 1.12 0.0483 23.1 1.68e-118
## 13 `Rushing Yards` -0.790 0.0597 -13.2 4.92e- 40
## 14 `Yards Per Carry` -0.0491 0.0415 -1.18 2.37e- 1
## 15 `Rushing TDs` 0.0741 0.0291 2.54 1.10e- 2
## 16 Fumbles -0.227 0.0365 -6.23 4.52e- 10
## 17 `Fumbles Lost` -0.164 0.0359 -4.59 4.53e- 6
library(dotwhisker)
# Dot-and-whisker plot of odds ratios with 95% intervals.
tidy(logistic_mod_fit) %>% # Viz of Trained Model Odds Ratios
mutate(conf.low = exp(estimate - 1.96*std.error),conf.high = exp(estimate + 1.96*std.error)) %>% # CI endpoints on the odds scale; do this first (before estimate is overwritten)
mutate(estimate = exp(estimate)) %>% # this second
dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 1, color = "grey50", linetype = 2)) +
labs(x = 'Odds Ratio') +
theme_classic()
# Soft Predictions: Predicted Probabilities
# Attach .pred_L / .pred_W probability columns to the game logs.
logistic_output <- qb_logs %>%
bind_cols(predict(logistic_mod_fit, new_data = qb_logs, type = 'prob'))
logistic_output %>% head()
## # A tibble: 6 × 19
## Outcome `Passes Completed` `Passes Attempte… `Completion Perc… `Passing Yards`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 W 18 29 62.1 176
## 2 L 5 8 62.5 25
## 3 L 11 28 39.3 154
## 4 L 19 36 52.8 230
## 5 W 13 22 59.1 142
## 6 L 4 13 30.8 67
## # … with 14 more variables: Passing Yards Per Attempt <dbl>, TD Passes <dbl>,
## # Ints <dbl>, Sacks <dbl>, Sacked Yards Lost <dbl>, Passer Rating <dbl>,
## # Rushing Attempts <dbl>, Rushing Yards <dbl>, Yards Per Carry <dbl>,
## # Rushing TDs <dbl>, Fumbles <dbl>, Fumbles Lost <dbl>, .pred_L <dbl>,
## # .pred_W <dbl>
# Visualize predicted probabilities as a function of true outcome
logistic_output %>%
ggplot(aes(x = Outcome, y = .pred_W)) +
geom_boxplot() +
geom_hline(yintercept = 0.5, color='red') +
labs(y = 'Predicted Probability of Winning', x = 'Observed Outcome (L or W)') +
theme_classic()
# Same plot for the complementary losing probability.
logistic_output %>%
ggplot(aes(x = Outcome, y = .pred_L)) +
geom_boxplot() +
geom_hline(yintercept = 0.5, color='red') +
labs(y = 'Predicted Probability of LOSING', x = 'Observed Outcome (L or W)') +
theme_classic()
# Hard class predictions at the default 0.5 threshold on .pred_L.
logistic_output <- logistic_output %>%
mutate(.pred_class = make_two_class_pred(.pred_L, levels(Outcome), threshold = .5)) #77.1 accuracy
#head(logistic_output)
#logistic_output %>%
# count(Outcome, .pred_class)
# Confusion matrix of predicted vs observed outcomes.
logistic_output %>%
conf_mat(truth = Outcome, estimate = .pred_class)
## Truth
## Prediction L W
## L 3428 1038
## W 1072 3691
set.seed(1512)
qblogs_cv10 <- vfold_cv(qb_logs, v=10)
# CV Fit Model
# Fixed: resample an *unfitted* workflow — fit_resamples() refits on each fold
# and expects a workflow/spec, not the already-trained logistic_mod_fit.
logistic_wf <- workflow() %>%
  add_recipe(outcome_rec) %>%
  add_model(logistic_mod)
log_cv_fit <- fit_resamples(
  logistic_wf,
  resamples = qblogs_cv10,
  metrics = metric_set(sens, yardstick::spec, accuracy, roc_auc),
  control = control_resamples(save_pred = TRUE, event_level = 'second')) # you need predictions for ROC calculations
collect_metrics(log_cv_fit) #default threshold is 0.5
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.771 10 0.00420 Preprocessor1_Model1
## 2 roc_auc binary 0.850 10 0.00483 Preprocessor1_Model1
## 3 sens binary 0.782 10 0.00630 Preprocessor1_Model1
## 4 spec binary 0.760 10 0.00428 Preprocessor1_Model1
# Random-forest recipe: same Outcome ~ . formula, standardized numerics.
# (scaling is not required for trees; harmless but unnecessary)
outcome_rf_rec <- recipe(Outcome ~ ., data = qb_logs) %>%
step_normalize(all_numeric_predictors())
# Random forest of 100 trees via ranger, hard class predictions first.
rf_spec <- rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(mtry = NULL, # size of random subset of variables; default is floor(sqrt(ncol(x)))
trees = 100, # Number of bags
min_n = 5,
probability = FALSE, # want hard predictions first
importance = 'impurity') %>%
set_mode('classification') # change this for regression tree
rf_spec
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 100
## min_n = 5
##
## Engine-Specific Arguments:
## probability = FALSE
## importance = impurity
##
## Computational engine: ranger
# Assemble and train the random-forest workflow (seeded for reproducibility).
outcome_rf_wf <- workflow() %>%
add_model(rf_spec) %>%
add_recipe(outcome_rf_rec)
set.seed(123)
outcome_rf_fit <- outcome_rf_wf %>%
fit(data = qb_logs)
outcome_rf_fit # check out OOB prediction error (accuracy = 1 - OOB prediction error)
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100, min.node.size = min_rows(~5, x), probability = ~FALSE, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Classification
## Number of trees: 100
## Sample size: 9229
## Number of independent variables: 16
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 22.74 %
# Pair ranger's out-of-bag hard predictions with the true outcomes, then score
# sensitivity/specificity/accuracy on them.
outcome_rf_OOB_output <- tibble(
.pred_class = outcome_rf_fit %>% extract_fit_engine() %>% pluck('predictions'),
Outcome = qb_logs %>% pull(Outcome))
bag_metrics <- metric_set(sens, yardstick::spec, accuracy)
outcome_rf_OOB_output %>%
bag_metrics(truth = Outcome, estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.779
## 2 spec binary 0.767
## 3 accuracy binary 0.773
set.seed(123) #to get the same bootstrap samples, use same seed
# Refit with probability = TRUE so ranger returns OOB class probabilities
# (needed for the ROC curve below).
outcome_rf_fit2 <- outcome_rf_wf %>%
update_model(rf_spec %>% set_args(probability = TRUE)) %>%
fit(data = qb_logs)
outcome_rf_fit2
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100, min.node.size = min_rows(~5, x), probability = ~TRUE, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Probability estimation
## Number of trees: 100
## Sample size: 9229
## Number of independent variables: 16
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1569944
# OOB win probabilities alongside the true outcomes, for ROC analysis.
outcome_rf_OOB_output2 <- bind_cols(
outcome_rf_fit2 %>% extract_fit_engine() %>% pluck('predictions') %>% as_tibble(),
qb_logs %>% select(Outcome))
# event_level = "second" treats 'W' (second factor level) as the event.
outcome_rf_OOB_output2 %>%
roc_curve(Outcome, W, event_level = "second") %>% autoplot()
outcome_rf_OOB_output2 %>%
roc_auc(Outcome, W, event_level = "second") #Area under Curve
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.851
library(vip) #install.packages('vip')
# Variable-importance plots: impurity-based from the existing fit, then a
# refit with permutation importance for comparison.
outcome_rf_fit %>% extract_fit_engine() %>% vip() #based on impurity
outcome_rf_wf %>% #based on permutation
update_model(rf_spec %>% set_args(importance = "permutation")) %>%
fit(data = qb_logs) %>% extract_fit_engine() %>% vip()
# Side-by-side comparison: CV logistic metrics vs random-forest OOB metrics.
collect_metrics(log_cv_fit) #CV Logistic Model
## # A tibble: 4 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.771    10 0.00420 Preprocessor1_Model1
## 2 roc_auc  binary     0.850    10 0.00483 Preprocessor1_Model1
## 3 sens     binary     0.782    10 0.00630 Preprocessor1_Model1
## 4 spec     binary     0.760    10 0.00428 Preprocessor1_Model1
outcome_rf_OOB_output %>% #RF OOB Accuracy
bag_metrics(truth = Outcome, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.779
## 2 spec     binary         0.767
## 3 accuracy binary         0.773
outcome_rf_OOB_output2 %>% #RF AUC
roc_auc(Outcome, W, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.851
# Select the variables to be used in clustering.
# Clustering uses only the numeric per-game stats; drop the outcome label.
qb_logs_sub <- qb_logs %>%
  select(!Outcome)
# Quick look at each variable's distribution before scaling.
summary(qb_logs_sub)
## Passes Completed Passes Attempted Completion Percentage Passing Yards
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.:15.00 1st Qu.:25.00 1st Qu.: 53.70 1st Qu.:168.0
## Median :19.00 Median :32.00 Median : 60.70 Median :221.0
## Mean :19.14 Mean :31.57 Mean : 60.38 Mean :222.1
## 3rd Qu.:23.00 3rd Qu.:38.00 3rd Qu.: 67.60 3rd Qu.:277.0
## Max. :43.00 Max. :69.00 Max. :100.00 Max. :527.0
## Passing Yards Per Attempt TD Passes Ints Sacks
## Min. : 0.000 Min. :0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.700 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 1.000
## Median : 6.900 Median :1.000 Median :1.0000 Median : 2.000
## Mean : 7.082 Mean :1.349 Mean :0.9157 Mean : 2.094
## 3rd Qu.: 8.200 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 3.000
## Max. :66.000 Max. :7.000 Max. :7.0000 Max. :11.000
## Sacked Yards Lost Passer Rating Rushing Attempts Rushing Yards
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. :-31.00
## 1st Qu.: 4.00 1st Qu.: 64.80 1st Qu.: 1.000 1st Qu.: 0.00
## Median :11.00 Median : 84.30 Median : 2.000 Median : 5.00
## Mean :13.51 Mean : 84.13 Mean : 2.622 Mean : 10.09
## 3rd Qu.:20.00 3rd Qu.:103.60 3rd Qu.: 4.000 3rd Qu.: 15.00
## Max. :91.00 Max. :158.30 Max. :22.000 Max. :181.00
## Yards Per Carry Rushing TDs Fumbles Fumbles Lost
## Min. :-26.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean : 2.969 Mean :0.09459 Mean :0.5383 Mean :0.2234
## 3rd Qu.: 5.000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. : 44.000 Max. :3.00000 Max. :6.0000 Max. :3.0000
# Hierarchical clustering of games on standardized stats.
# Scale first so every stat contributes comparably to the Euclidean distance.
dist_mat_scaled <- dist(scale(qb_logs_sub))
# Fit one tree per linkage rule; the scaled distance matrix is the input to
# hclust() and the method argument selects the linkage type.
hc_complete <- hclust(dist_mat_scaled, method = "complete")
hc_single <- hclust(dist_mat_scaled, method = "single")
hc_average <- hclust(dist_mat_scaled, method = "average")
hc_centroid <- hclust(dist_mat_scaled, method = "centroid")
# Dendrogram for each linkage.
for (tree in list(hc_complete, hc_single, hc_average, hc_centroid)) {
  plot(tree)
}
# Attach cluster assignments from the complete-linkage tree.
# (fixed: removed a stray '<<<<<<< HEAD' merge-conflict marker inside the
# mutate() call, and a duplicated plot() line)
qb_logs <- qb_logs %>%
  mutate(
    hclust_height3 = factor(cutree(hc_complete, h = 3)), # Cut at height (h) 3
    # NOTE(review): the name says 6 but k = 3 — this is a 3-cluster cut. Name
    # kept because later code references hclust_num6.
    hclust_num6 = factor(cutree(hc_complete, k = 3)) # Cut into 3 clusters (k)
  )
plot(hc_complete, labels = qb_logs$hclust_height3)
plot(hc_complete, labels = qb_logs$hclust_num6)
# PCA on the numeric stats (exclude outcome and cluster labels), standardized.
mat <- qb_logs %>%
  select(-Outcome, -hclust_height3, -hclust_num6) %>%
  as.matrix()
pca_out <- prcomp(mat, center = TRUE, scale = TRUE)
# Loadings of the first few variables on each principal component.
pca_out %>% pluck('rotation') %>% head()
## PC1 PC2 PC3 PC4
## Passes Completed 0.3716850 -0.27288303 0.18577370 -0.23266956
## Passes Attempted 0.2368035 -0.35424044 0.24246876 -0.34715781
## Completion Percentage 0.3777490 0.06715226 -0.07504039 0.16610358
## Passing Yards 0.4304861 -0.21129937 0.11267792 -0.13944330
## Passing Yards Per Attempt 0.3592990 0.10863208 -0.13577730 0.22293786
## TD Passes 0.3696420 0.03053012 -0.02378479 0.01901386
## PC5 PC6 PC7 PC8
## Passes Completed 0.013526852 0.044247834 -0.24386898 0.05570391
## Passes Attempted 0.041116903 -0.006248729 -0.27398074 -0.02432124
## Completion Percentage -0.058537147 0.118479501 0.06141751 0.19643588
## Passing Yards 0.019214760 0.040710125 0.05196286 -0.02370818
## Passing Yards Per Attempt -0.029852332 0.085558698 0.48760486 0.03584002
## TD Passes 0.008185073 -0.266913398 0.18212157 -0.24531741
## PC9 PC10 PC11 PC12
## Passes Completed -0.19037221 -0.1266949 0.0002363081 0.003278444
## Passes Attempted 0.05026027 0.1164014 0.0020067820 0.009255326
## Completion Percentage -0.59303574 -0.4860531 0.0073397576 0.009831694
## Passing Yards 0.06211819 0.4021595 -0.0045303117 -0.001038738
## Passing Yards Per Attempt -0.07152699 0.5435094 0.0004705980 0.015312213
## TD Passes 0.62608881 -0.4412705 0.0033561191 -0.013089873
## PC13 PC14 PC15 PC16
## Passes Completed -0.006028635 -0.006319295 0.19478709 0.73736015
## Passes Attempted 0.005755979 -0.571474156 -0.15821416 -0.44037360
## Completion Percentage -0.013447817 0.030583562 0.17017476 -0.37627986
## Passing Yards 0.000403107 0.708455495 -0.02319586 -0.25405675
## Passing Yards Per Attempt 0.008272566 -0.408798164 0.24569647 0.12412746
## TD Passes 0.008883324 -0.032655275 0.32278200 -0.06799853
# First two PC scores with cluster and outcome labels attached, for plotting.
# (fixed: resolved a leftover merge conflict — both sides closed bind_cols()
# with the same ')', so a single closing paren is the correct resolution)
pca_out_plot <- pca_out %>% pluck('x') %>% as.data.frame() %>% select(PC1, PC2) %>% bind_cols(
  tibble(cluster = qb_logs$hclust_num6, Outcome = qb_logs$Outcome)
)
pca_out %>% head()
# Variables ordered by the magnitude of their PC1 loading.
pca_out$rotation %>% as.data.frame() %>% select(PC1) %>%
  arrange(desc(abs(PC1)))
## PC1
## Passing Yards 0.43048614
## Passer Rating 0.42219398
## Completion Percentage 0.37774897
## Passes Completed 0.37168501
## TD Passes 0.36964203
## Passing Yards Per Attempt 0.35929901
## Passes Attempted 0.23680352
## Ints -0.11354133
## Sacks -0.08666869
## Sacked Yards Lost -0.08394013
## Fumbles -0.05532546
## Fumbles Lost -0.03497970
## Yards Per Carry -0.02796717
## Rushing TDs 0.02396848
## Rushing Yards -0.02123042
## Rushing Attempts 0.01555846
# Variables ordered by the magnitude of their PC2 loading.
pca_out$rotation %>% as.data.frame() %>% select(PC2) %>%
#abs() %>%
arrange(desc(abs(PC2)))
## PC2
## Sacks -0.43922376
## Sacked Yards Lost -0.42991899
## Passes Attempted -0.35424044
## Fumbles -0.33962177
## Fumbles Lost -0.30736245
## Passes Completed -0.27288303
## Passing Yards -0.21129937
## Ints -0.21103781
## Rushing Yards -0.18281352
## Yards Per Carry -0.17824523
## Passer Rating 0.15852117
## Rushing Attempts -0.11793770
## Passing Yards Per Attempt 0.10863208
## Completion Percentage 0.06715226
## Rushing TDs -0.04276507
## TD Passes 0.03053012
# Score plot: games in PC1-PC2 space, colored by observed outcome.
pca_out %>%
pluck('x') %>%
as.data.frame() %>%
mutate(labels = qb_logs$Outcome) %>%
ggplot(aes(x = PC1, y = PC2, color = factor(labels))) +
geom_point() +
labs(x = 'PC1', y = 'PC2') +
scale_color_viridis_d() +
theme_classic()
# Score plot colored by hierarchical-cluster assignment instead of outcome.
# (fixed: removed a stray '<<<<<<< HEAD' merge-conflict marker)
pca_out %>%
  pluck('x') %>%
  as.data.frame() %>%
  mutate(labels = qb_logs$hclust_num6) %>%
  ggplot(aes(x = PC1, y = PC2, color = factor(labels))) +
  geom_point() +
  labs(x = 'PC1', y = 'PC2') +
  scale_color_viridis_d() +
  theme_classic()
# Scree plots for the PCA.
# (fixed: removed a stray '<<<<<<< HEAD' merge-conflict marker)
# Variance explained by each PC is the squared singular value (sdev^2);
# pve is each PC's share of the total variance.
var_explained <- (pca_out %>% pluck('sdev'))^2
pve <- var_explained / sum(var_explained)
var_data <- tibble(
  PC = seq_along(var_explained),
  var_explained = var_explained,
  pve = pve
)
# Construct scree plots: per-PC proportion, then the cumulative proportion.
p1 <- var_data %>%
  ggplot(aes(x = PC, y = pve)) +
  geom_point() +
  geom_line() +
  labs(x = 'Principal Component', y = 'Proportion of variance explained') + # fixed 'varinace' typo
  theme_classic()
p2 <- var_data %>%
  ggplot(aes(x = PC, y = cumsum(pve))) +
  geom_point() +
  geom_line() +
  labs(x = 'Principal Component', y = 'Cumulative proportion of variance explained') +
  theme_classic()
library(ggpubr)
ggarrange(p1, p2)